#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

import argparse
from pathlib import Path

import numpy as np
import pandas as pd


# Columns that must be present in the monthly input for aggregation to work.
_REQUIRED_COLUMNS = ("city_code6", "month", "docs_total", "docs_hit")

# Column layout of the yearly output; also used as the header of an empty output.
_YEAR_COLUMNS = (
    "city_code6",
    "year",
    "docs_total",
    "docs_hit",
    "months",
    "mean_topic_intensity",
    "topic_intensity_year",
)


def _read_month_table(in_path: Path) -> pd.DataFrame:
    """Read the monthly CSV, returning an empty frame for a zero-byte file.

    ``city_code6`` is forced to text so that a blank code elsewhere in the
    column cannot make pandas infer float64 (which would render ``110000`` as
    ``"110000.0"`` and blanks as ``"nan"``).
    """
    try:
        return pd.read_csv(in_path, encoding="utf-8-sig", dtype={"city_code6": str})
    except pd.errors.EmptyDataError:
        # No header at all: treat like "no rows" instead of crashing.
        return pd.DataFrame()


def _aggregate_city_year(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse city-month rows into one row per (city_code6, year).

    Rows with a missing city code, an unparsable year, or non-numeric
    ``docs_total`` / ``docs_hit`` are dropped before aggregation.
    """
    df = df.copy()

    # Nullable string dtype keeps missing codes as <NA> (unlike astype(str),
    # which would turn them into the literal "nan"), so dropna below removes them.
    df["city_code6"] = df["city_code6"].astype("string").str.zfill(6)
    df["month"] = df["month"].astype(str)
    # The year is taken from the first four characters of the month label.
    df["year"] = pd.to_numeric(df["month"].str.slice(0, 4), errors="coerce").astype("Int64")
    df["docs_total"] = pd.to_numeric(df["docs_total"], errors="coerce")
    df["docs_hit"] = pd.to_numeric(df["docs_hit"], errors="coerce")
    # topic_intensity is optional; when absent the yearly mean is simply NaN.
    if "topic_intensity" in df.columns:
        df["topic_intensity"] = pd.to_numeric(df["topic_intensity"], errors="coerce")
    else:
        df["topic_intensity"] = np.nan

    df = df.dropna(subset=["city_code6", "year", "docs_total", "docs_hit"]).copy()

    # Year aggregation: keep both intensive and extensive margins.
    g = df.groupby(["city_code6", "year"], as_index=False).agg(
        docs_total=("docs_total", "sum"),
        docs_hit=("docs_hit", "sum"),
        months=("month", "nunique"),
        mean_topic_intensity=("topic_intensity", "mean"),
    )
    # Pooled yearly share of hits; a zero denominator yields NaN rather than inf.
    g["topic_intensity_year"] = g["docs_hit"] / g["docs_total"].replace({0: np.nan})
    return g


def main() -> None:
    """Aggregate the city-month topic table into a city-year CSV.

    Command-line options:
        --in   path of the monthly input CSV
        --out  path of the yearly output CSV (parent directories are created)

    Raises:
        SystemExit: if the input file does not exist, or if a non-empty input
            lacks one of the required columns.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="in_path", default="media_project/out/gov_wechat_lowcarbon_city_month.csv")
    ap.add_argument("--out", dest="out_path", default="media_project/out/gov_wechat_lowcarbon_city_year.csv")
    args = ap.parse_args()

    in_path = Path(args.in_path)
    if not in_path.exists():
        raise SystemExit(f"Missing input: {in_path}. Run build_gov_wechat_topic_index.py first.")

    out_path = Path(args.out_path)
    df = _read_month_table(in_path)

    if df.empty:
        # Emit the yearly header (not the monthly one) so the output schema is
        # the same whether or not any rows were available.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        pd.DataFrame(columns=list(_YEAR_COLUMNS)).to_csv(out_path, index=False, encoding="utf-8-sig")
        print(f"Wrote {out_path} rows=0")
        return

    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise SystemExit(f"Input {in_path} is missing required columns: {', '.join(missing)}")

    g = _aggregate_city_year(df)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    g.to_csv(out_path, index=False, encoding="utf-8-sig")
    print(f"Wrote {out_path} rows={len(g)} cities={g['city_code6'].nunique()} years={g['year'].nunique()}")


# Run the aggregation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

